# How soccer became a global sport: where did it start and what changed as more teams were starting to compete.
# Which countries have dominated the different eras of soccer since everything started.
# Cleaning, processing and first exploration
# As seen below, this data set consists of (supposedly) all games since the inaugural Scotland - England in 1872.
# For each game, we have the score, the tournament, the host city and country.
### Loading libraries
library(ggplot2) # Data visualization
## Warning: package 'ggplot2' was built under R version 4.2.2
library(readr) # CSV file I/O, e.g. the read_csv function
## Warning: package 'readr' was built under R version 4.2.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.2.2
# Reading input file.
# Load the match results into `df` with readr.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file location exists.
df <- read_csv("C://Users//Nishtha//Documents//bhavuk//Semester 6//DV//J Comp//results.csv")
## Rows: 44353 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): home_team, away_team, tournament, city, country
## dbl (2): home_score, away_score
## lgl (1): neutral
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the freshly loaded data.
head(df)
## # A tibble: 6 × 9
## date home_team away_team home_sc…¹ away_…² tourn…³ city country neutral
## <date> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <lgl>
## 1 1872-11-30 Scotland England 0 0 Friend… Glas… Scotla… FALSE
## 2 1873-03-08 England Scotland 4 2 Friend… Lond… England FALSE
## 3 1874-03-07 Scotland England 2 1 Friend… Glas… Scotla… FALSE
## 4 1875-03-06 England Scotland 2 2 Friend… Lond… England FALSE
## 5 1876-03-04 Scotland England 3 0 Friend… Glas… Scotla… FALSE
## 6 1876-03-25 Scotland Wales 4 0 Friend… Glas… Scotla… FALSE
## # … with abbreviated variable names ¹home_score, ²away_score, ³tournament
# Let's check if we have some NA or NULL values we should clean.
# Apparently not. Good news, let's continue.
# Count the missing values in every column.
# `is.na()` on a data frame returns a logical matrix, so `colSums()` gives the
# per-column NA count directly. This avoids `apply()` on a data frame, which
# first coerces every column to character. The former `is.null(v)` test was
# dropped: a column vector is never NULL, so it could not flag anything.
colSums(is.na(df))
## date home_team away_team home_score away_score tournament city
## 0 0 0 0 0 0 0
## country neutral
## 0 0
# Let's process a bit the data so that we can have a quicker access to some important feature such as the result or the names of the winning or losing team. The outcome of a game will be encoded as D for draw, H for the home team winning and A for the away team winning. We will also extract some date-related features such as the day of week or month.
# Encode the result of a game from the home side's point of view.
#
# Args:
#   home_score, away_score: numeric goals scored by each side; scalars or
#     vectors (usual R recycling applies).
# Returns:
#   A character vector with "H" (home win), "A" (away win) or "D" (draw).
#   A missing score yields NA instead of an error.
game_outcome <- function(home_score, away_score) {
  # sign() maps the goal difference to -1 / 0 / 1; adding 2 turns that into an
  # index into the lookup vector, which keeps the function fully vectorised.
  c("A", "D", "H")[sign(home_score - away_score) + 2]
}
# Name of the team that won a game.
#
# Args:
#   home_score, away_score: numeric goals scored by each side.
#   home_team, away_team: character team names.
#   All arguments may be scalars or vectors of matching length.
# Returns:
#   The winner's name, or NA for a draw or when a score is missing.
winning_team <- function(home_score, away_score, home_team, away_team) {
  # Nested ifelse() keeps the function vectorised, so it works both row by row
  # and on whole columns.
  ifelse(
    home_score > away_score,
    home_team,
    ifelse(home_score < away_score, away_team, NA_character_)
  )
}
# Name of the team that lost a game.
#
# Args:
#   home_score, away_score: numeric goals scored by each side.
#   home_team, away_team: character team names.
#   All arguments may be scalars or vectors of matching length.
# Returns:
#   The loser's name, or NA for a draw or when a score is missing.
losing_team <- function(home_score, away_score, home_team, away_team) {
  # Nested ifelse() keeps the function vectorised, so it works both row by row
  # and on whole columns.
  ifelse(
    home_score < away_score,
    home_team,
    ifelse(home_score > away_score, away_team, NA_character_)
  )
}
# Enrich every game with calendar features (year, abbreviated month, weekday)
# and with its result (outcome code, winner, loser).
# NOTE(review): month and weekday names presumably follow the session locale,
# while later factor levels are spelled in English -- confirm the locale.
df <- df %>%
  mutate(
    year = format(date, "%Y"),
    month = format(date, "%b"),
    dayofweek = weekdays(date)
  ) %>%
  # The result helpers are evaluated one game at a time.
  rowwise() %>%
  mutate(
    outcome = game_outcome(home_score = home_score, away_score = away_score),
    winning_team = winning_team(
      home_score = home_score,
      away_score = away_score,
      home_team = home_team,
      away_team = away_team
    ),
    losing_team = losing_team(
      home_score = home_score,
      away_score = away_score,
      home_team = home_team,
      away_team = away_team
    )
  ) %>%
  ungroup()
head(df)
## # A tibble: 6 × 15
## date home_…¹ away_…² home_…³ away_…⁴ tourn…⁵ city country neutral year
## <date> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <lgl> <chr>
## 1 1872-11-30 Scotla… England 0 0 Friend… Glas… Scotla… FALSE 1872
## 2 1873-03-08 England Scotla… 4 2 Friend… Lond… England FALSE 1873
## 3 1874-03-07 Scotla… England 2 1 Friend… Glas… Scotla… FALSE 1874
## 4 1875-03-06 England Scotla… 2 2 Friend… Lond… England FALSE 1875
## 5 1876-03-04 Scotla… England 3 0 Friend… Glas… Scotla… FALSE 1876
## 6 1876-03-25 Scotla… Wales 4 0 Friend… Glas… Scotla… FALSE 1876
## # … with 5 more variables: month <chr>, dayofweek <chr>, outcome <chr>,
## # winning_team <chr>, losing_team <chr>, and abbreviated variable names
## # ¹home_team, ²away_team, ³home_score, ⁴away_score, ⁵tournament
# Now, let's do some basic exploration. How many entries? Answer: more than 44k matches (44353 rows, 15 columns).
dim(df)
## [1] 44353 15
# A journey through the historical landscape of international soccer
# Which teams play the most?
# Let's start by checking which are the most represented teams? This will tell us which are the team with the richest history.
# Surprisingly, Sweden is the team that has played the most games. Most top 10 countries are major soccer nations such as Brazil, Argentina, England, Germany or France. Countries such as Uruguay, Mexico and Hungary are also old teams, as they participated in the first World Cups (1930 and/or 1934).
# Long view of the data: one row per (team, game), listing every home side
# first and every away side second, together with the year of the game.
all_teams <- data.frame(
  teams = c(df$home_team, df$away_team),
  year = as.numeric(rep(df$year, times = 2))
)
# Number of games played by each team, busiest teams first.
all_teams_count <- all_teams %>%
  group_by(teams) %>%
  summarise(number_games = n()) %>%
  arrange(desc(number_games))
head(all_teams_count, 10)
## # A tibble: 10 × 2
## teams number_games
## <chr> <int>
## 1 Sweden 1053
## 2 England 1049
## 3 Brazil 1021
## 4 Argentina 1018
## 5 Germany 986
## 6 Hungary 966
## 7 Mexico 935
## 8 Uruguay 919
## 9 South Korea 905
## 10 France 880
# It is likely all these teams have a different trajectory, some might have start playing earlier and some later. The plot below displays the cumulative sum of the number of matches for these top 10 teams. Hover the line to display the name of the team. You can also click on a team's name to hide/show it.
# Yearly number of games (before 2018) for the ten teams with the most games.
# `year_date` places each year on 1 January so it can be used on a date axis.
top_teams_games_per_year <- all_teams %>%
  filter(teams %in% head(all_teams_count, 10)$teams, year < 2018) %>%
  group_by(teams, year) %>%
  summarise(nb_games = n()) %>%
  mutate(year_date = as.Date(paste0(year, "-01-01")))
## `summarise()` has grouped output by 'teams'. You can override using the
## `.groups` argument.
library(plotly)
# Running total of games per team, accumulated in chronological order.
top_teams_games_per_year <- top_teams_games_per_year %>%
  arrange(teams, year) %>%
  group_by(teams) %>%
  mutate(cumsum = cumsum(nb_games))
# One line per team; ggplotly() turns the static plot into an interactive one.
p <- ggplot(
  top_teams_games_per_year,
  aes(x = year_date, y = cumsum, colour = teams, group = teams)
) +
  geom_line() +
  labs(
    x = "Year",
    y = "Cumulated number of games",
    title = "Top 10 teams in total number of games",
    colour = "Click on a team \nto hide/show it"
  )
ggplotly(p)
# The 10 most active teams indeed have different trajectories. England gets its second position thanks to the many games it played in the 19th century. Some countries such as Sweden, France or Hungary have a steadier progression, while teams like Korea or Mexico join the top 10 thanks to their recent hyperactivity (Korea's first official games were just before 1950).
# How many games per year?
# Let's now check how many games were played each year and how the total number of international games evolve with time.
# Number of international games played each year (before 2018).
# `year` is stored as text in `df`, so it is converted to a number *before*
# the filter; comparing the text column with 2018 would be a string
# comparison that only works by accident for four-digit years.
tmp <- df %>%
  mutate(year = as.numeric(year)) %>%
  filter(year < 2018) %>%
  group_by(year) %>%
  summarise(nb_games = n()) %>%
  ungroup()
# Single line showing the yearly volume of games, one tick per decade.
ggplot(tmp, aes(x = year, y = nb_games, group = 1)) +
  geom_line() +
  labs(x = "Year", title = "Number of international soccer games", y = "") +
  scale_x_continuous(breaks = seq(1870, 2020, 10))

# There are few interestings things going on here:
# * Number of games is rising, with high growth in the 80s/90s.
# * It seems there is a peak around 2010, with a slight decrease since.
# * We see a drop during world wars.
# * Since the 80s, data is very spiky, likely due to the absence/presence of world cups or other events.
#
# Let's try to visualise this to add some understanding to our plot.
# World Cup years: 1930, 1934, 1938, then every four years from 1950 to 2014.
wc_years <- c(1930, 1934, 1938, seq(1950, 2014, 4))
# Flag the years in which a World Cup took place.
tmp <- tmp %>%
  mutate(is_wc = year %in% wc_years)
# Same yearly curve as before, with World Cup years highlighted as points and
# thin vertical markers at 1914, 1918, 1939 and 1945.
ggplot(tmp, aes(x = year, y = nb_games, group = 1)) +
  geom_line() +
  geom_point(data = tmp %>% filter(is_wc), aes(colour = is_wc)) +
  labs(
    x = "Year",
    title = "Number of international soccer games",
    y = "",
    colour = "World cup year"
  ) +
  # `linewidth` is the supported way to set line thickness since ggplot2 3.4.0;
  # the previous `lwd` argument triggered a deprecation warning.
  geom_vline(
    xintercept = c(1914, 1918, 1939, 1945),
    linewidth = 0.3,
    colour = "gray80"
  ) +
  scale_x_continuous(breaks = seq(1870, 2020, 10))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# The two main drops indeed correspond to the 2 world wars but, surprisingly, the world cup years are those counting less matches.
# Let's investigate which are the most common game types and competitions every year, since 2000.
# Number of games per tournament and per year.
# The result stays grouped by `tournament` (default summarise() behaviour).
df_competitions <- df %>%
  group_by(tournament, year) %>%
  summarise(nb_games = n())
## `summarise()` has grouped output by 'tournament'. You can override using the
## `.groups` argument.
# Stacked bars of games per year (2000-2017), one colour per tournament.
# `year` is text in `df_competitions`, so it is converted for the range test
# only; the column itself stays text to keep a discrete x axis.
ggplot(
  df_competitions %>%
    filter(as.numeric(year) >= 2000, as.numeric(year) < 2018),
  aes(x = year, y = nb_games, fill = tournament)
) +
  geom_bar(stat = "identity") +
  # "none" hides the legend; `FALSE` is deprecated since ggplot2 3.3.4.
  guides(fill = "none") +
  labs(x = "Year", y = "Number of games")
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# We can see that some events/tournaments are more frequent on non-world cup years such as 2007 or 2011. Let's check what they are.
# Tournaments played in 2011, most games first.
arrange(filter(df_competitions, year == 2011), desc(nb_games))
## # A tibble: 22 × 3
## # Groups: tournament [22]
## tournament year nb_games
## <chr> <chr> <int>
## 1 Friendly 2011 379
## 2 FIFA World Cup qualification 2011 216
## 3 UEFA Euro qualification 2011 154
## 4 African Cup of Nations qualification 2011 77
## 5 AFC Asian Cup 2011 32
## 6 AFC Challenge Cup qualification 2011 29
## 7 Island Games 2011 29
## 8 Pacific Games 2011 29
## 9 CECAFA Cup 2011 26
## 10 Copa América 2011 26
## # … with 12 more rows
# Tournaments played in 2010, most games first.
arrange(filter(df_competitions, year == 2010), desc(nb_games))
## # A tibble: 21 × 3
## # Groups: tournament [21]
## tournament year nb_games
## <chr> <chr> <int>
## 1 Friendly 2010 423
## 2 UEFA Euro qualification 2010 94
## 3 FIFA World Cup 2010 64
## 4 African Cup of Nations qualification 2010 48
## 5 CFU Caribbean Cup qualification 2010 34
## 6 African Cup of Nations 2010 29
## 7 AFF Championship 2010 24
## 8 AFC Asian Cup qualification 2010 19
## 9 CECAFA Cup 2010 18
## 10 CFU Caribbean Cup 2010 16
## # … with 11 more rows
# World Cup qualifications generate many more matches than the World Cup itself, which makes sense as the World Cup only concerns 32 countries. This is well shown in the two plots below: there are no WC qualification matches during a World Cup year, and the number of qualification matches is greater than the number of WC matches by a factor of 3 to 7 in general.
# Keep five major game types between 2006 and 2017.
# `year` is text in `df_competitions`, so it is converted for the range test
# only; the column itself stays text to keep a discrete x axis in the plots.
df_competition_filtered <- df_competitions %>%
  filter(
    as.numeric(year) >= 2006,
    as.numeric(year) < 2018,
    tournament %in% c(
      "Friendly",
      "UEFA Euro qualification",
      "FIFA World Cup",
      "FIFA World Cup qualification",
      "African Cup of Nations qualification"
    )
  )
# One line (with points) per competition showing its yearly number of games.
ggplot(
  df_competition_filtered,
  aes(x = year, y = nb_games, group = tournament, colour = tournament)
) +
  geom_point() +
  geom_line() +
  labs(x = "Year", y = "Nb games", colour = "Competition")

# %% [code]
# Same data as stacked bars: yearly games split by competition.
ggplot(
  df_competition_filtered,
  aes(x = year, y = nb_games, group = tournament, fill = tournament)
) +
  geom_bar(stat = "identity") +
  labs(fill = "Competition", x = "Year", y = "Nb games")

# Worldwide soccer adoption
# When did soccer start to be widely played, i.e. when do most nations start playing international games? The plot below teaches us several things:
#
# * The number of teams steadily increased 1902 and this increase accelerated up to 1920.
# * From there, the pace of addition of new teams increase much faster and stalls abit around the late 40's
# * Then we see a steady and rapid growth up to the mid 1990's.
# Year of the first recorded game of every team.
df_teams_start <- all_teams %>%
  mutate(year = as.numeric(year)) %>%
  group_by(teams) %>%
  summarise(first_game = min(year))
# Number of teams debuting each year, and the running total of teams.
df_year_teams_start <- df_teams_start %>%
  group_by(first_game) %>%
  summarise(n = n()) %>%
  arrange(first_game) %>%
  mutate(cumsum = cumsum(n))
# Growth curve of the number of teams that have played at least one game.
ggplot(df_year_teams_start, aes(x = first_game, y = cumsum)) +
  geom_line() +
  labs(
    x = "Year",
    title = "Cumulative sum of number of international soccer teams",
    y = ""
  ) +
  scale_x_continuous(breaks = seq(1870, 2020, 10))

# Which were the first and last teams to join?
# The four first teams to compete in international games were from what is now forming UK. Soccer then crossed the pond and teams such as Canada, USA, Argentina or Uruguay joined the party. In the same time, central European countries such as Austria and Hungary also join the internation arena.
# Amongst the late joiners we mostly find tiny countries (Vatican or Comoros) and recent ones (Kosovo or South Sudan). We also find Caribbean or North American islands which aren't countries but collectivities or municipalities of countries such as France or the Netherlands. Although they are not nations, they competed against other countries either in friendly games or in local tournaments.
# The ten teams with the earliest first game.
head(arrange(df_teams_start, first_game), 10)
## # A tibble: 10 × 2
## teams first_game
## <chr> <dbl>
## 1 England 1872
## 2 Scotland 1872
## 3 Wales 1876
## 4 Northern Ireland 1882
## 5 Canada 1885
## 6 United States 1885
## 7 Argentina 1902
## 8 Austria 1902
## 9 Hungary 1902
## 10 Uruguay 1902
# The ten teams with the most recent first game.
tail(arrange(df_teams_start, first_game), 10)
## # A tibble: 10 × 2
## teams first_game
## <chr> <dbl>
## 1 Surrey 2018
## 2 Yorkshire 2018
## 3 Chameria 2019
## 4 Saint Helena 2019
## 5 Aymara 2022
## 6 Biafra 2022
## 7 Brunei Darussalam 2022
## 8 Mapuche 2022
## 9 Maule Sur 2022
## 10 Yoruba Nation 2022
# We have seen how different teams and continent started to compete one after the others. Let's now see what did this imply for the game itself and its organisation.
# When do games occur?
# Interestingly, the very first games mostly occurred on Saturdays but a decent number also took place on Mondays! No game occurred on a Sunday until 1900, potentially for religious reasons, but around the 1910's Sunday was the most common day of the week to see an international game. Other week days, from Tuesday to Friday, weren't an option until later (as late as 1910 for Fridays).
#
# The proportion of games happenning on a given day then changed quite a lot. Wednesdays games became very common and around 30% of the games happened on this day around the year 2000. More recently days such as Tuesday, Thursday or Friday also became more popular.
# Share of each year's games (before 2018) played on each day of the week.
# The weekday is turned into a factor ordered Monday..Sunday so that plot
# facets follow the calendar order.
df_games_per_dayofweek <- df %>%
  mutate(year = as.numeric(year)) %>%
  filter(year < 2018) %>%
  group_by(year, dayofweek) %>%
  summarise(n = n()) %>%
  group_by(year) %>%
  mutate(
    perc = n / sum(n) * 100,
    dayofweek = factor(
      dayofweek,
      levels = c(
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday"
      )
    )
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# One panel per weekday showing the yearly percentage of games played that day.
ggplot(
  df_games_per_dayofweek,
  aes(x = year, y = perc, colour = dayofweek, group = dayofweek)
) +
  geom_line() +
  facet_wrap(~dayofweek) +
  labs(x = "Year", y = "Percentage of games played") +
  # "none" hides the legend; `FALSE` is deprecated since ggplot2 3.3.4.
  guides(colour = "none") +
  scale_x_continuous(breaks = seq(1870, 2020, 20)) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Now that we have looked at days, let's check whether some months are more popular for soccer games. The first games mostly occur during Spring months and since then, some month have known some peaks of popularity for intenational games at different period (e.g. many games happened in December in the 1940s).
# In a more recent history, international games became less common in May but more in June.
# Share of each year's games (before 2018) played in each month.
# The month is turned into a factor ordered Jan..Dec so that plot facets
# follow the calendar order.
df_games_per_month <- df %>%
  mutate(year = as.numeric(year)) %>%
  filter(year < 2018) %>%
  group_by(year, month) %>%
  summarise(n = n()) %>%
  group_by(year) %>%
  mutate(
    perc = n / sum(n) * 100,
    month = factor(
      month,
      levels = c(
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
      )
    )
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# One panel per month showing the yearly percentage of games played that month.
ggplot(
  df_games_per_month,
  aes(x = year, y = perc, colour = month, group = month)
) +
  geom_line() +
  facet_wrap(~month) +
  labs(x = "Year", y = "Percentage of games played") +
  # "none" hides the legend; `FALSE` is deprecated since ggplot2 3.3.4.
  guides(colour = "none") +
  scale_x_continuous(breaks = seq(1870, 2020, 20)) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Evolution of results
# Let's now talk about sport and actual results! First let's check how the proportion of draws and home/away victories evolves through time. Main learnings are:
# * A victory of the home-based team has always been the most likely event.
# * A victory of the visitors is the second most likely outcome, although it tends to decrease in the second half of the 20th century.
# * A draw has always been the least likely outcome, altough it has increased in share since the 1940's.
# It is to be noted that the "home" team isn't always playing on his own country, as for example during world or continental cups.
# Yearly count and percentage of each outcome code (H / A / D).
df_outcome_per_year <- df %>%
  mutate(year = as.numeric(year)) %>%
  group_by(year, outcome) %>%
  summarise(n = n()) %>%
  group_by(year) %>%
  mutate(
    total_year = sum(n),
    perc = n / total_year * 100
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Outcome shares between 1901 and 2017: raw yearly lines plus a loess trend
# for each outcome.
ggplot(
  df_outcome_per_year %>% filter(year > 1900, year < 2018),
  aes(x = year, y = perc, group = outcome, colour = outcome)
) +
  geom_line() +
  geom_smooth(se = FALSE, method = "loess") +
  labs(x = "Year", y = "Percentage of games", colour = "Outcome") +
  scale_x_continuous(breaks = seq(1870, 2020, 20))
## `geom_smooth()` using formula = 'y ~ x'

# Let's now get to what is at the heart of soccer: goals! How did this evolve with time?
# Although it started low (the first game resulted in a 0-0 between Scotland and England), then number of goals per games quickly skyrocketed and, before 1900, the average number of goals per game per year could be as high as 8!
# This average then stabilized around 4 until 1950 and then decreased down to 2.5 in a more modern era. The 80's has been the period were games delivered the lowest number of goals.
# Per year: number of games, total goals (both sides) and goals per game.
df_goals_per_game <- df %>%
  mutate(year = as.numeric(year)) %>%
  group_by(year) %>%
  summarise(
    nb_games = n(),
    nb_goals = sum(home_score + away_score),
    goals_per_game = nb_goals / nb_games
  )
# Yearly average number of goals per game, one tick per decade.
ggplot(df_goals_per_game, aes(x = year, y = goals_per_game)) +
  geom_line() +
  scale_x_continuous(breaks = seq(1870, 2020, 10)) +
  labs(x = "Year", y = "", title = "Average number of goals per game")

#Review 2
# # Best performing teams during soccer history
#
# ## Which teams were consistent high scorer and good defender across time?
#
# We have seen how games became globally less prolific in goals, but what about teams? Did some teams always scored a lot or, at contrary, were some always great defenders?
# First, let's transform a bit the data for this purpose. We will now have two entries per game, one from the perspective of each team.
# Describe one game from the home team's perspective.
#
# Args:
#   v: a named vector for one game with the elements "date", "year",
#      "tournament", "home_team", "away_team", "home_score", "away_score".
#      When called through apply(df, 1, ...) every element is a string.
# Returns:
#   A vector of nine elements: date, year, tournament, team, opponent,
#   goals for, goals against, the team's outcome ("W", "L" or "D") and "H"
#   to mark the home side.
games_info_home <- function(v) {
  # Scores are converted to numbers for the comparison only; comparing them
  # as strings would order "10" before "9". The original values are returned
  # untouched.
  goals_for <- as.numeric(v[["home_score"]])
  goals_against <- as.numeric(v[["away_score"]])
  team1_outcome <- "D"
  if (goals_for > goals_against) {
    team1_outcome <- "W"
  } else if (goals_for < goals_against) {
    team1_outcome <- "L"
  }
  c(
    v["date"], v["year"], v["tournament"], v["home_team"], v["away_team"],
    v["home_score"], v["away_score"], team1_outcome, "H"
  )
}
# Describe one game from the away team's perspective.
#
# Args:
#   v: a named vector for one game with the elements "date", "year",
#      "tournament", "home_team", "away_team", "home_score", "away_score".
#      When called through apply(df, 1, ...) every element is a string.
# Returns:
#   A vector of nine elements: date, year, tournament, team, opponent,
#   goals for, goals against, the team's outcome ("W", "L" or "D") and "A"
#   to mark the away side.
games_info_away <- function(v) {
  # Scores are converted to numbers for the comparison only; comparing them
  # as strings would order "10" before "9". The original values are returned
  # untouched.
  goals_for <- as.numeric(v[["away_score"]])
  goals_against <- as.numeric(v[["home_score"]])
  team2_outcome <- "D"
  if (goals_for > goals_against) {
    team2_outcome <- "W"
  } else if (goals_for < goals_against) {
    team2_outcome <- "L"
  }
  c(
    v["date"], v["year"], v["tournament"], v["away_team"], v["home_team"],
    v["away_score"], v["home_score"], team2_outcome, "A"
  )
}
# Build the two-rows-per-game table: every game seen once from the home side
# and once from the away side.
df_teams_games_home <- t(apply(df, 1, games_info_home))
df_teams_games_away <- t(apply(df, 1, games_info_away))
df_teams_games <- rbind(df_teams_games_home, df_teams_games_away)
colnames(df_teams_games) <- c(
  "date", "year", "tournament", "team", "opponent",
  "team_score", "opponent_score", "team_outcome", "where"
)
# The stacked matrix holds text only, so dates and numbers are parsed back
# before sorting the games chronologically.
df_teams_games <- df_teams_games %>%
  as.data.frame() %>%
  mutate(
    date = as.Date(date),
    year = as.numeric(as.character(year)),
    team_score = as.numeric(as.character(team_score)),
    opponent_score = as.numeric(as.character(opponent_score))
  ) %>%
  arrange(date)
head(df_teams_games, 10)
## date year tournament team opponent team_score opponent_score
## 1 1872-11-30 1872 Friendly Scotland England 0 0
## 2 1872-11-30 1872 Friendly England Scotland 0 0
## 3 1873-03-08 1873 Friendly England Scotland 4 2
## 4 1873-03-08 1873 Friendly Scotland England 2 4
## 5 1874-03-07 1874 Friendly Scotland England 2 1
## 6 1874-03-07 1874 Friendly England Scotland 1 2
## 7 1875-03-06 1875 Friendly England Scotland 2 2
## 8 1875-03-06 1875 Friendly Scotland England 2 2
## 9 1876-03-04 1876 Friendly Scotland England 3 0
## 10 1876-03-04 1876 Friendly England Scotland 0 3
## team_outcome where
## 1 D H
## 2 D A
## 3 W H
## 4 L A
## 5 W H
## 6 L A
## 7 D H
## 8 D A
## 9 W H
## 10 L A
# Goals scored and conceded per game, and number of games, for every team and
# year.
df_teams_goals_per_year <- df_teams_games %>%
  group_by(team, year) %>%
  summarise(
    gf_per_game = sum(team_score) / n(),
    ga_per_game = sum(opponent_score) / n(),
    total_games = n()
  )
## `summarise()` has grouped output by 'team'. You can override using the
## `.groups` argument.
head(df_teams_goals_per_year, 10)
## # A tibble: 10 × 5
## # Groups: team [2]
## team year gf_per_game ga_per_game total_games
## <chr> <dbl> <dbl> <dbl> <int>
## 1 Abkhazia 2012 0.5 2 2
## 2 Abkhazia 2014 1.2 1.2 5
## 3 Abkhazia 2016 3 0.2 5
## 4 Abkhazia 2017 1 1.2 5
## 5 Abkhazia 2018 2.5 0.667 6
## 6 Abkhazia 2019 1.2 0.6 5
## 7 Afghanistan 1941 1 3 1
## 8 Afghanistan 1950 0 4 1
## 9 Afghanistan 1975 0.5 3 6
## 10 Afghanistan 1976 0.667 0.667 3
# When filtering out teams with fewer than 25 games, the names of the most prolific teams overall might surprise you. The top 4 greatest scorers are small Oceanian teams such as New Caledonia, Tahiti, Papua New Guinea or Fiji. This is likely explained by these teams mostly competing against other "local" teams in more open games. Amongst the most "conventional" soccer nations, Germany, England and Brazil make it to the podium with, respectively, 2.25, 2.19 and 2.19 goals per game on average during their history.
# All-time goals scored and conceded per game, and number of games, per team.
df_teams_goals_overall <- df_teams_games %>%
  group_by(team) %>%
  summarise(
    gf_per_game = sum(team_score) / n(),
    ga_per_game = sum(opponent_score) / n(),
    total_games = n()
  )
# Ten highest-scoring teams among those with more than 25 games.
top10_attack <- df_teams_goals_overall %>%
  filter(total_games > 25) %>%
  arrange(desc(gf_per_game)) %>%
  head(10) %>%
  select(team, gf_per_game, total_games)
top10_attack
## # A tibble: 10 × 3
## team gf_per_game total_games
## <chr> <dbl> <int>
## 1 Sápmi 3.23 26
## 2 Isle of Man 3.18 49
## 3 Northern Cyprus 2.88 34
## 4 Padania 2.74 43
## 5 Gotland 2.7 30
## 6 Basque Country 2.69 58
## 7 Tahiti 2.66 213
## 8 New Caledonia 2.64 237
## 9 Isle of Wight 2.41 44
## 10 Papua New Guinea 2.34 122
# The top defenses also offer some surprises. Iran and Morocco have the best defenses with 0.82 and 0.85 goals in average during around 500 games! Spain and Brazil make it to the top 5. Italy, the mother nation of [the Catenaccio](https://en.wikipedia.org/wiki/Catenaccio) closes the top 10.
# Ten teams conceding the fewest goals per game, among those with more than
# 25 games.
top10_defense <- df_teams_goals_overall %>%
  filter(total_games > 25) %>%
  arrange(ga_per_game) %>%
  head(10) %>%
  select(team, ga_per_game, total_games)
top10_defense
## # A tibble: 10 × 3
## team ga_per_game total_games
## <chr> <dbl> <int>
## 1 Padania 0.767 43
## 2 Iran 0.798 525
## 3 Morocco 0.838 580
## 4 Abkhazia 0.857 28
## 5 Spain 0.892 733
## 6 Brazil 0.894 1021
## 7 South Korea 0.902 905
## 8 Northern Cyprus 0.912 34
## 9 Iraq 0.948 577
## 10 Italy 0.961 838
# If we look at what happened since 1980 only, the picture only changes slightly.
# Top scorer teams are still from Oceania.
# Amongst the best defenses, 6 of the top 10 teams are now from Europe (including all teams from the top 3) and the number of goals against per game has dropped to between 0.73 and 0.88. Brazil, considered as a very offensive team, still makes it to the top 10.
# Ten highest-scoring teams when only games played after 1980 are counted
# (teams with more than 25 such games).
df_teams_games %>%
  filter(year > 1980) %>%
  group_by(team) %>%
  summarise(
    gf_per_game = sum(team_score) / n(),
    ga_per_game = sum(opponent_score) / n(),
    total_games = n()
  ) %>%
  filter(total_games > 25) %>%
  arrange(desc(gf_per_game)) %>%
  head(10) %>%
  select(team, gf_per_game, total_games)
## # A tibble: 10 × 3
## team gf_per_game total_games
## <chr> <dbl> <int>
## 1 Sápmi 3.23 26
## 2 Isle of Man 3.18 49
## 3 Northern Cyprus 3.16 31
## 4 Padania 2.74 43
## 5 Gotland 2.7 30
## 6 New Caledonia 2.60 152
## 7 Isle of Wight 2.41 44
## 8 Basque Country 2.39 31
## 9 Tahiti 2.38 141
## 10 Fiji 2.26 186
# Ten teams conceding the fewest goals per game when only games played after
# 1980 are counted (teams with more than 25 such games).
df_teams_games %>%
  filter(year > 1980) %>%
  group_by(team) %>%
  summarise(
    gf_per_game = sum(team_score) / n(),
    ga_per_game = sum(opponent_score) / n(),
    total_games = n()
  ) %>%
  filter(total_games > 25) %>%
  arrange(ga_per_game) %>%
  head(10) %>%
  select(team, ga_per_game, total_games)
## # A tibble: 10 × 3
## team ga_per_game total_games
## <chr> <dbl> <int>
## 1 Brazil 0.723 599
## 2 England 0.734 492
## 3 Northern Cyprus 0.742 31
## 4 Morocco 0.748 441
## 5 Spain 0.759 490
## 6 Iran 0.760 438
## 7 Padania 0.767 43
## 8 France 0.770 492
## 9 Italy 0.781 480
## 10 Netherlands 0.829 451
# Let's look at how the defense and offense skills of these teams have evolved through time.
#
# Some of the best scoring teams are on a declining trend, such as Hungary, Tahiti or Papua New Guinea. However, other teams such as Germany, Brazil or Fiji are very stable, which is remarkable as, as seen before, the overall number of goals per game is decreasing.
#
# The best defending teams are following the global trend of games delivering less goals and are generallty taking less goals too.
# Yearly goals scored per game for the ten best attacks, one panel per team,
# with a linear trend line.
ggplot(
  top10_attack %>%
    select(team) %>%
    # Each team matches many yearly rows by design; stating it explicitly
    # silences the join's multiple-match warning.
    left_join(df_teams_goals_per_year, by = "team", multiple = "all"),
  aes(x = year, y = gf_per_game, colour = team)
) +
  geom_line() +
  facet_wrap(~team) +
  labs(x = "Year", y = "Goal scored per game") +
  # "none" hides the legend; `FALSE` is deprecated since ggplot2 3.3.4.
  guides(colour = "none") +
  geom_smooth(method = "lm")
## Warning in left_join(., df_teams_goals_per_year, by = "team"): Each row in `x` is expected to match at most 1 row in `y`.
## ℹ Row 1 of `x` matches multiple rows.
## ℹ If multiple matches are expected, set `multiple = "all"` to silence this
## warning.
## `geom_smooth()` using formula = 'y ~ x'

# Yearly goals conceded per game for the ten best defenses, one panel per
# team, with a linear trend line.
ggplot(
  top10_defense %>%
    select(team) %>%
    # Each team matches many yearly rows by design; stating it explicitly
    # silences the join's multiple-match warning.
    left_join(df_teams_goals_per_year, by = "team", multiple = "all"),
  aes(x = year, y = ga_per_game, colour = team)
) +
  geom_line() +
  facet_wrap(~team) +
  labs(x = "Year", y = "Goal against per game") +
  # "none" hides the legend; `FALSE` is deprecated since ggplot2 3.3.4.
  guides(colour = "none") +
  geom_smooth(method = "lm")
## Warning in left_join(., df_teams_goals_per_year, by = "team"): Each row in `x` is expected to match at most 1 row in `y`.
## ℹ Row 1 of `x` matches multiple rows.
## ℹ If multiple matches are expected, set `multiple = "all"` to silence this
## warning.
## `geom_smooth()` using formula = 'y ~ x'

# ## Defense and attack per decade
#
# We have looked at defense and attack overall but it is very likely that the best defending and attacking countries haven't always been the same. So let's break this down by decade.
# Goals scored and conceded per game, and number of games, for every team and
# decade.
#
# Two fixes compared with the previous version:
# * the decade breaks now extend past 2020 when the data does, so recent games
#   get a real decade instead of falling outside the breaks (NA);
# * the decade label is derived from the start of the decade rather than from
#   the earliest game year, so the first decade reads "1870's", not "1872's".
#
# `min_year` still holds the earliest year with a game in the decade. The
# result stays grouped by `decade`, as before.
df_teams_goals_per_decade <- df_teams_games %>%
  mutate(
    decade = cut(
      year,
      breaks = seq(
        1870,
        max(2020, (max(year, na.rm = TRUE) %/% 10 + 1) * 10),
        by = 10
      ),
      dig.lab = 4,
      right = FALSE
    )
  ) %>%
  group_by(team, decade) %>%
  summarise(
    gf_per_game = sum(team_score) / n(),
    ga_per_game = sum(opponent_score) / n(),
    total_games = n(),
    min_year = min(year)
  ) %>%
  ungroup() %>%
  group_by(decade) %>%
  mutate(
    min_year = min(min_year),
    decade_year = paste0((min_year %/% 10) * 10, "'s")
  )
## `summarise()` has grouped output by 'team'. You can override using the
## `.groups` argument.
#group_by(decade) %>%
#top_n(n=6, wt=winrate) %>%
#ungroup() %>%
#arrange(desc(decade), desc(winrate)) %>%
#mutate(ord = rev(row_number())) %>%
#mutate(decade_year = paste(min_year, "'s", sep=""))
# Highest-scoring teams of each decade, among teams with more than 10 games
# in that decade. top_n() keeps ties, so a decade can hold more than 6 rows.
# `ord` numbers the sorted rows in reverse; the bar chart uses it as a unique
# axis position for every (decade, team) pair.
df_teams_goals_per_decade_top_gf <- df_teams_goals_per_decade %>%
  filter(total_games > 10) %>%
  group_by(decade_year) %>%
  top_n(n = 6, wt = gf_per_game) %>%
  ungroup() %>%
  arrange(desc(decade_year), desc(gf_per_game)) %>%
  mutate(ord = rev(seq_len(n())))
# Best scoring teams have changed quite a lot through the different decades in soccer history. Some of the lessons we can learn are:
#
# * Scotland once, was one of the top scoring nations (OK, that was when max 10 teams were competing, but still) and slowly dropped from the top 6.
# * Sweden was consistently in the top 6 for 4 decades in a row (1910s to 1940s).
# * Fiji and Tahiti were at the top of the charts during some decades too, including some recent ones.
# * Zambia and China once were among the top scorers.
# * During the last 3 decades, Germany and Spain are the only major nations who made it twice to the top 6.
# Horizontal bar chart of the top-scoring teams, one panel per decade.
# Bars are positioned by the unique `ord` index; the continuous axis is then
# relabelled so that every `ord` break shows the matching team name.
ggplot(
  df_teams_goals_per_decade_top_gf,
  aes(x = ord, y = gf_per_game, fill = team)
) +
  geom_col() +
  facet_wrap(~decade_year, scales = "free_y") +
  coord_flip() +
  scale_x_continuous(
    labels = df_teams_goals_per_decade_top_gf$team,
    breaks = df_teams_goals_per_decade_top_gf$ord
  ) +
  labs(x = "", y = "Goals scored per game") +
  # Team names are already on the axis; "none" replaces the deprecated
  # `fill = FALSE` spelling.
  guides(fill = "none")

# Let's look at defenses now. Here is what we can see:
# * Scotland also used to have a good defense.
# * England and Germany were solid during the 1930's and 1940's.
# * China and Tahiti were amongst the best defenses between the 1960's and 1980's.
# * Despite being seen as an offensive team, Brazil had the #1 and #3 best defense in the 1980's and 1990's.
# * Germany was the second best defense two decades in a row (2000's and 2010's)
# Best defences of each decade, among teams with more than 10 games in that
# decade. Ties are kept, so a decade can hold more than 6 rows.
# `ord` numbers the sorted rows in reverse; the bar chart uses it as a unique
# axis position for every (decade, team) pair.
df_teams_goals_per_decade_top_ga <- df_teams_goals_per_decade %>%
  filter(total_games > 10) %>%
  group_by(decade_year) %>%
  # A good defence concedes few goals: keep the 6 LOWEST ga_per_game values.
  # (The previous selection ranked on gf_per_game, i.e. it re-listed the top
  # scorers instead of the best defences.)
  slice_min(ga_per_game, n = 6) %>%
  ungroup() %>%
  arrange(desc(decade_year), ga_per_game) %>%
  mutate(ord = rev(row_number()))
# Horizontal bar chart of goals conceded per game for the teams selected in
# `df_teams_goals_per_decade_top_ga`, one panel per decade.
# Bars are positioned by the unique `ord` index; the continuous axis is then
# relabelled so that every `ord` break shows the matching team name.
ggplot(
  df_teams_goals_per_decade_top_ga,
  aes(x = ord, y = ga_per_game, fill = team)
) +
  geom_col() +
  facet_wrap(~decade_year, scales = "free_y") +
  coord_flip() +
  scale_x_continuous(
    labels = df_teams_goals_per_decade_top_ga$team,
    breaks = df_teams_goals_per_decade_top_ga$ord
  ) +
  labs(x = "", y = "Goal against per game") +
  # Team names are already on the axis; "none" replaces the deprecated
  # `fill = FALSE` spelling.
  guides(fill = "none")

# Are defense and attack correlated, i.e. are the top scorers also the best defense?
# Below, we can see that the teams scoring very few goals per game are also more likely to have a poorer defense. However, past a limit of around 1.5 goals scored per game, the quality of the defense remains rather constant.
# In general, teams above the line generally have a bad defense given their attack level and teams below the line have a better defense given their attack stats.
# Attack versus defence: one point per (team, decade), coloured by decade,
# with a single smoother fitted over all points.
ggplot(df_teams_goals_per_decade, aes(x = gf_per_game, y = ga_per_game)) +
  # Colour is mapped on the points only. The smoother pools every decade into
  # one group, so it cannot carry a per-decade colour; inheriting it made
  # ggplot2 warn that the colour aesthetic was dropped.
  geom_point(aes(colour = decade_year)) +
  geom_smooth(aes(group = 1)) +
  labs(x = "Goals for", y = "Goals against", colour = "Decade")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Looking at decades separately, we can see that this trend holds true most of the time, with some variations. For example, the relation was flatter in the 70's but is more pronounced in the most recent decades.
# Same attack-versus-defence scatter, split into one panel per decade and
# restricted to decades whose `min_year` is after 1900. Each panel gets its
# own loess smoother and its own axis ranges.
ggplot(
  df_teams_goals_per_decade %>% filter(min_year > 1900),
  aes(x = gf_per_game, y = ga_per_game, colour = decade_year)
) +
  geom_point(size = 0.5) +
  facet_wrap(~decade_year, scales = "free") +
  geom_smooth(aes(group = 1), method = "loess") +
  labs(x = "Goals for", y = "Goals against") +
  # Panels are already titled by decade; "none" replaces the deprecated
  # `colour = FALSE` spelling.
  guides(colour = "none")
## `geom_smooth()` using formula = 'y ~ x'

# ## Overall, which team has the best win ratio?
#
# Now that we have looked at attack and defense, let's move to what finally matters the most: winning. It can be seen as fair to say that the most dominating team is the one that wins the highest number of games. Let's then compute the win ratio of all teams.
# Number of games per year per team
# One row per (team, year) before 2018 with the number of games played,
# plus `year_date`, the 1st of January of that year as a Date.
df_team_games_per_year <- all_teams %>%
  filter(year < 2018) %>%
  group_by(teams, year) %>%
  summarise(nb_games = n()) %>%
  mutate(year_date = as.Date(paste0(year, "-01-01")))
## `summarise()` has grouped output by 'teams'. You can override using the
## `.groups` argument.
# Number of victories per year
# One row per (year, winning team) with the number of games won.
# Games without a winner (NA `winning_team`) are left out; `year` is
# converted to numeric so it can be joined with the games-per-year table.
df_nb_victories <- df %>%
  filter(!is.na(winning_team)) %>%
  group_by(year = as.numeric(year), winning_team) %>%
  summarise(nb_victories = n())
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Number of losses per year
# One row per (year, losing team) with the number of games lost.
# Games without a loser (NA `losing_team`) are left out; `year` is
# converted to numeric so it can be joined with the games-per-year table.
df_nb_losses <- df %>%
  filter(!is.na(losing_team)) %>%
  group_by(year = as.numeric(year), losing_team) %>%
  summarise(nb_losses = n())
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Putting all this together
# Yearly record of every team: games, victories, losses and ties.
# Victories and losses are attached with left joins, so a team with no win
# (or no loss) in a given year gets NA; those NAs are turned into 0 before
# ties are derived as the remainder.
df_teams_winrate <- df_team_games_per_year %>%
  left_join(df_nb_victories, by = c("year", "teams" = "winning_team")) %>%
  left_join(df_nb_losses, by = c("year", "teams" = "losing_team")) %>%
  mutate(
    nb_victories = ifelse(is.na(nb_victories), 0, nb_victories),
    nb_losses = ifelse(is.na(nb_losses), 0, nb_losses),
    nb_ties = nb_games - (nb_victories + nb_losses)
  )
# Let's look overall
# All-time record per team: yearly counts are summed, then expressed as
# win / loss / tie percentages of the games played.
df_teams_winrate_overall <- df_teams_winrate %>%
  group_by(teams) %>%
  summarise(
    nb_games = sum(nb_games),
    nb_victories = sum(nb_victories),
    nb_losses = sum(nb_losses),
    nb_ties = sum(nb_ties)
  ) %>%
  mutate(
    winrate = nb_victories / nb_games * 100,
    lossrate = nb_losses / nb_games * 100,
    tierate = nb_ties / nb_games * 100
  ) %>%
  ungroup()
# We will remove teams who played less than 10 games in total as they might have rather random win ratios (otherwise, the top 2 teams have a 100% win rate and... 1 game only).
# This time, the top teams are not a surprise: Brazil, Germany and Spain. Some teams are more surprising such as Jersey or Northern Cyprus. Together with Brazil, Argentina and Iran are the only non-European countries in this top 10. Czech Republic and Croatia also make it to this top 10.
# Print the 10 teams with the highest win rate (more than 10 games played).
head(
  arrange(
    filter(df_teams_winrate_overall, nb_games > 10),
    desc(winrate)
  ),
  n = 10
)
## # A tibble: 10 × 8
## teams nb_games nb_victories nb_lo…¹ nb_ties winrate lossr…² tierate
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Padania 32 23 3 6 71.9 9.38 18.8
## 2 Jersey 78 52 15 11 66.7 19.2 14.1
## 3 Basque Country 55 36 9 10 65.5 16.4 18.2
## 4 Brazil 957 607 156 194 63.4 16.3 20.3
## 5 Andalusia 13 8 1 4 61.5 7.69 30.8
## 6 Rhodes 18 11 5 2 61.1 27.8 11.1
## 7 Northern Cyprus 28 17 7 4 60.7 25 14.3
## 8 Germany 927 545 193 189 58.8 20.8 20.4
## 9 Spain 670 391 128 151 58.4 19.1 22.5
## 10 Isle of Man 45 26 15 4 57.8 33.3 8.89
## # … with abbreviated variable names ¹nb_losses, ²lossrate
# Long-format table (team, rate name, rate value) of the 10 teams with the
# best win rate, ready for a stacked bar chart.
# `teams` becomes a factor whose levels follow increasing win rate; the
# levels are computed before the top 10 is cut, so unused levels remain.
df_teams_winrate_overall_mold <- df_teams_winrate_overall %>%
  filter(nb_games > 10) %>%
  select(teams, winrate, lossrate, tierate) %>%
  arrange(desc(winrate)) %>%
  mutate(teams = factor(teams, levels = teams[order(winrate)])) %>%
  head(n = 10) %>%
  melt(id.vars = "teams")
# Stacked horizontal bars: for each of the top 10 teams, the win, loss and
# tie percentages stacked on top of each other.
ggplot(
  df_teams_winrate_overall_mold,
  aes(x = teams, y = value, fill = variable, group = teams)
) +
  geom_col() +
  coord_flip() +
  labs(
    x = "",
    y = "Percentage",
    fill = "",
    title = "Top 10 teams by overall win rate"
  )

# Now that we looked at the best teams, which are the ones with the lowest win ratio? Nations with less than 10 games played are filtered out.
#
# Unsurprisingly, they are mostly small nations. Kiribati is the only one of those nations that never won a game; their only draw dates from 1979 and, ironically, it is not listed on their [Wikipedia page](https://en.wikipedia.org/wiki/Kiribati_national_football_team) (but their 24-0 defeat to Fiji is).
#
# Print the 10 teams with the lowest win rate (more than 10 games played).
head(
  arrange(
    filter(df_teams_winrate_overall, nb_games > 10),
    winrate
  ),
  n = 10
)
## # A tibble: 10 × 8
## teams nb_games nb_victories nb_los…¹ nb_ties winrate lossr…² tierate
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Kiribati 11 0 10 1 0 90.9 9.09
## 2 San Marino 147 1 142 4 0.680 96.6 2.72
## 3 Andorra 145 4 129 12 2.76 89.0 8.28
## 4 Djibouti 83 3 76 4 3.61 91.6 4.82
## 5 Anguilla 51 3 45 3 5.88 88.2 5.88
## 6 Luxembourg 381 28 306 47 7.35 80.3 12.3
## 7 Liechtenstein 180 14 144 22 7.78 80 12.2
## 8 Timor-Leste 51 4 44 3 7.84 86.3 5.88
## 9 Somalia 104 9 83 12 8.65 79.8 11.5
## 10 American Samoa 43 4 38 1 9.30 88.4 2.33
## # … with abbreviated variable names ¹nb_losses, ²lossrate
# Print every game in which Kiribati took part, at home or away.
filter(df, home_team == "Kiribati" | away_team == "Kiribati")
## # A tibble: 11 × 15
## date home_team away_…¹ home_…² away_…³ tourn…⁴ city country neutral
## <date> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <lgl>
## 1 1979-08-30 Fiji Kiriba… 24 0 South … Naus… Fiji FALSE
## 2 1979-08-31 Kiribati Papua … 0 13 South … Suva Fiji TRUE
## 3 1979-09-05 Kiribati Tuvalu 3 3 South … Naus… Fiji TRUE
## 4 2003-06-30 Tuvalu Kiriba… 3 2 South … Suva Fiji TRUE
## 5 2003-07-03 Solomon Isl… Kiriba… 7 0 South … Suva Fiji TRUE
## 6 2003-07-05 Fiji Kiriba… 12 0 South … Naus… Fiji FALSE
## 7 2003-07-07 Kiribati Vanuatu 0 18 South … Laut… Fiji TRUE
## 8 2011-08-30 Fiji Kiriba… 9 0 Pacifi… Boul… New Ca… TRUE
## 9 2011-09-01 Cook Islands Kiriba… 3 0 Pacifi… Boul… New Ca… TRUE
## 10 2011-09-03 Kiribati Papua … 1 17 Pacifi… Boul… New Ca… TRUE
## 11 2011-09-05 Kiribati Tahiti 1 17 Pacifi… Boul… New Ca… TRUE
## # … with 6 more variables: year <chr>, month <chr>, dayofweek <chr>,
## # outcome <chr>, winning_team <chr>, losing_team <chr>, and abbreviated
## # variable names ¹away_team, ²home_score, ³away_score, ⁴tournament
# Now that we have seen top and struggling team, let's check if some teams are best at draw games. Here again, only nations with more than 10 games were considered.
#
# Interestingly, 8 out of the top 10 teams in term of ties are from Africa. The numbers aren't extreme though as Angola, the top team in this ranking, has a nearly 35% tie rate, which is not far from what would be the expectation if the outcome of a game was purely random.
# Print the 10 teams with the highest tie rate (more than 10 games played).
head(
  arrange(
    filter(df_teams_winrate_overall, nb_games > 10),
    desc(tierate)
  ),
  n = 10
)
## # A tibble: 10 × 8
## teams nb_games nb_victories nb_lo…¹ nb_ties winrate lossr…² tierate
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Abkhazia 17 6 3 8 35.3 17.6 47.1
## 2 Angola 345 117 108 120 33.9 31.3 34.8
## 3 Lesotho 235 42 120 73 17.9 51.1 31.1
## 4 Andalusia 13 8 1 4 61.5 7.69 30.8
## 5 Ellan Vannin 13 5 4 4 38.5 30.8 30.8
## 6 Cameroon 527 231 135 161 43.8 25.6 30.6
## 7 Botswana 259 67 115 77 25.9 44.4 29.7
## 8 Iraqi Kurdistan 27 13 6 8 48.1 22.2 29.6
## 9 Jordan 368 127 133 108 34.5 36.1 29.3
## 10 Lebanon 253 69 110 74 27.3 43.5 29.2
## # … with abbreviated variable names ¹nb_losses, ²lossrate
# ## Which are the best teams per decade
#
# That's the question we all want to see answered! It came as no surprise that Brazil or Germany have the highest win ratios, but was it always the case? Which teams dominated the different eras of football?
# Record of every team per decade: yearly counts are summed inside each
# 10-year bin and turned into win / loss / tie percentages.
# `min_year` is the first year in which the team played during the decade.
df_teams_winrate_per_decade <- df_teams_winrate %>%
  mutate(decade = cut(year, seq(1870, 2020, 10), dig.lab = 4, right = FALSE)) %>%
  group_by(teams, decade) %>%
  summarise(
    nb_games = sum(nb_games),
    nb_victories = sum(nb_victories),
    nb_losses = sum(nb_losses),
    nb_ties = sum(nb_ties),
    min_year = min(year)
  ) %>%
  mutate(
    winrate = nb_victories / nb_games * 100,
    lossrate = nb_losses / nb_games * 100,
    tierate = nb_ties / nb_games * 100
  ) %>%
  ungroup()
## `summarise()` has grouped output by 'teams'. You can override using the
## `.groups` argument.
# Teams with the best win rate of each decade, among teams with more than
# 10 games in that decade. top_n() keeps ties, so a decade can hold more
# than 6 rows.
# `min_year` is replaced by the smallest value found in the decade and `ord`
# numbers the sorted rows in reverse.
df_teams_winrate_per_decade_cleaned <- df_teams_winrate_per_decade %>%
  filter(nb_games > 10) %>%
  group_by(decade) %>%
  mutate(min_year = min(min_year)) %>%
  top_n(n = 6, wt = winrate) %>%
  ungroup() %>%
  arrange(desc(decade), desc(winrate)) %>%
  mutate(ord = rev(seq_len(n())))
# Here are some interesting findings we can collect form the plot below:
#
# * England used to be one of the top teams. There was little competition at the time, but they were almost always in the top 6 teams until the 1960's and have never been back in this elite club since then.
# * Egypt and Iran are among the teams that made it twice to the top 6 since the 1970's.
# * More or less expectedly, Argentina and Korea also are some of the teams regularly present in the top 5.
# * Brazil has consistently been in the top 5 for the last 8 decades, that's the most striking performance and a strong indicator that they have been the most regular team in the (semi-) recent history of soccer. They occupied the top spot three decades in a row, from the 1970's to the 1990's.
# * Germany can be considered as the second most regular team, making it 7 times in the top 5 in the last 9 decades. However, they never reached the first position.
# * Spain's domination in the recent history of football is clearly visible here as they have occupied the top spot of this ranking during the last two decades (although the current one is yet to be finished).
# Same ranking as above with a stricter threshold: only teams with more than
# 25 games in the decade compete for the top spots (ties are kept).
# `decade_year` is the facet label built from the smallest `min_year` of the
# decade; `ord` numbers the sorted rows in reverse for the bar chart axis.
df_teams_winrate_per_decade_cleaned2 <- df_teams_winrate_per_decade %>%
  filter(nb_games > 25) %>%
  group_by(decade) %>%
  mutate(min_year = min(min_year)) %>%
  top_n(n = 6, wt = winrate) %>%
  ungroup() %>%
  arrange(desc(decade), desc(winrate)) %>%
  mutate(
    ord = rev(seq_len(n())),
    decade_year = paste0(min_year, "'s")
  )
# Horizontal bar chart of the best win rates, one panel per decade.
# Bars are positioned by the unique `ord` index; the continuous axis is then
# relabelled so that every `ord` break shows the matching team name.
ggplot(
  df_teams_winrate_per_decade_cleaned2,
  aes(x = ord, y = winrate, fill = teams)
) +
  geom_col() +
  facet_wrap(~decade_year, scales = "free_y") +
  coord_flip() +
  scale_x_continuous(
    labels = df_teams_winrate_per_decade_cleaned2$teams,
    breaks = df_teams_winrate_per_decade_cleaned2$ord
  ) +
  labs(x = "", y = "Win rate (%)", title = "Top 6 best soccer teams per decade") +
  # Team names are already on the axis; "none" replaces the deprecated
  # `fill = FALSE` spelling.
  guides(fill = "none") +
  theme(axis.text.x = element_text(size = 6))

# ## How did the hierarchy between continent evolve?
#
# The previous plots showed that the dominating nations were often European or... Brazil. But if we look at whole continents, what happens? Are some continents really dominating?
#
# Continents are defined as local associations such as UEFA or AFC. America is the combination of CONCACAF and CONMEBOL. Australia is part of AFC and, therefore, considered as part of Asia in this case.
#
# From the two plots below, we can see that globally, the median win ratio is similar between continents although the spread might vary a lot. For example, Europe often has some of the best teams but also some of the worst performing (generally small states such as Andorra, Luxembourg,...). In comparison, the Americas generally show a more homogeneous distribution.
# Oceania has apparently a high win rate but there are mostly small teams competing between each others (Australia is part of the AFC, i.e. Asia).
# Africa, seems to be the poorest performing continent overall, at least in the last decades.
#df_teams_winrate_per_decade_cleaned2_per_continent <- df_teams_winrate_per_decade %>%
# filter(nb_games > 25) %>%
# group_by(decade) %>%
# mutate(min_year = min(min_year)) %>%
# ungroup() %>%
# arrange(desc(decade), desc(winrate)) %>%
# mutate(ord = rev(row_number())) %>%
# mutate(decade_year = paste(min_year, "'s", sep=""))
# ggplot(df_teams_winrate_per_decade_cleaned2_per_continent, aes(x=continent, y=winrate, colour=continent)) +
# geom_jitter(position=position_jitter(0.2), size=0.5) +
# facet_wrap(~ decade_year, scales = "free") +
# coord_flip() +
# stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,
# geom = "crossbar", size = 0.2, width=0.2, colour="black") +
# labs(x="", y="Win rate (%)") +
# guides(colour=FALSE)
# %% [code]
#p <- ggplot(df_teams_winrate_per_decade_cleaned2_per_continent, aes(x=continent, y=winrate, colour=continent)) +
#geom_jitter(position=position_jitter(0.2), size=0.5, aes(text=teams)) +
#facet_wrap(~ decade_year, scales = "free") +
#coord_flip() +
#stat_summary(fun.y = median, fun.ymin = median, fun.ymax = median,
# geom = "crossbar", size = 0.2, width=0.2, colour="black") +
#labs(x="", y="Win rate (%)") +
#guides(colour=FALSE)
#ggplotly(p)
# df_teams_median_winrate_per_decade_per_continent <- df_teams_winrate_per_decade_cleaned2_per_continent %>%
# group_by(continent, min_year, decade_year) %>%
# summarise(median = median(winrate))
# ggplot(df_teams_median_winrate_per_decade_per_continent, aes(x=min_year, y=median, group=continent, colour=continent)) +
# geom_line() +
# labs(x="Decade", title="Median win rate", colour="Continent", y="%")
# ## Identifying streaks
#
# Another way to look at domination at a more fine-grained level is to look at winning streaks, in particular between two different teams.
# First quick look at the data, which are the most common games?
#
# The most common games oppose neighbouring countries. This makes sense as in the early history of soccer, it was more convenient to play against near by countries.
# Every game sorted by team, then opponent, then date, so that the games of
# each (team, opponent) pair sit next to each other in chronological order.
df_streaks <- arrange(df_teams_games, team, opponent, date)
# Return the first row of a data frame.
#
# df: a data frame (or tibble).
# Returns a data frame of the same class holding the first row of `df`, or
# zero rows when `df` is empty.
get_first <- function(df) {
  # Unlike `df[1, ]`, head() never collapses a one-column data.frame into a
  # bare vector and does not invent a row of NAs for empty input.
  head(df, 1L)
}
# Print the 10 most frequent fixtures.
# Games are counted per (team, opponent); `key` puts the two names in
# alphabetical order so that "A vs B" and "B vs A" share one key, and only
# the first row of each key is kept before ranking by number of games.
df_streaks %>%
  mutate(
    team = as.character(team),
    opponent = as.character(opponent)
  ) %>%
  group_by(team, opponent) %>%
  summarise(n = n()) %>%
  rowwise() %>%
  mutate(key = paste(min(team, opponent), max(team, opponent), sep = "vs")) %>%
  ungroup() %>%
  group_by(key) %>%
  do(head(., 1)) %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  head(n = 10)
## `summarise()` has grouped output by 'team'. You can override using the
## `.groups` argument.
## # A tibble: 10 × 4
## team opponent n key
## <chr> <chr> <int> <chr>
## 1 Argentina Uruguay 179 ArgentinavsUruguay
## 2 Austria Hungary 137 AustriavsHungary
## 3 Belgium Netherlands 127 BelgiumvsNetherlands
## 4 England Scotland 117 EnglandvsScotland
## 5 Kenya Uganda 110 KenyavsUganda
## 6 Norway Sweden 109 NorwayvsSweden
## 7 Argentina Brazil 108 ArgentinavsBrazil
## 8 Denmark Sweden 107 DenmarkvsSweden
## 9 Scotland Wales 106 ScotlandvsWales
## 10 Argentina Paraguay 105 ArgentinavsParaguay
# Let's now identify, which teams have been consistently dominating one of their opponent. I define as streak a series of at least 6 games during which a given team never lost (draws are possible).
# Argentina had the longest streak ever observed against Chile. During 49 years and 35 games, Chile never won against Argentina. However, Chile can still brag of its series of 19 undefeated games against Ecuador.
# Some of these streaks are relatively old but some also ended during the last years (or are still ongoing) and they generally gather teams of the same continent, or even neighbouring countries.
## This chunk is a bit slow, I should see if there are ways to speed it up.
## Suggestions are welcome
## At the moment, the output is cached (as a dataset) to avoid regenerating it each time.
## TODO:
### Flag streaks which are still ongoing
# extract_streaks <- function(dfs, lvls, min_streak = 6, min_games = 6) {
# #print(dfs)
# if (length(dfs$team_outcome) >= min_games) {
# outcomes <- paste(dfs$team_outcome, collapse="")
# streaks <- strsplit(outcomes, "L")[[1]]
# streaks_length <- nchar(streaks)
# last_longest_streak <- max(which(streaks_length == max(streaks_length)))
# last_longest_streak_length = max(streaks_length)
# if (last_longest_streak_length >= min_streak) {
# streak_begin <- sum(streaks_length[1:last_longest_streak]) + last_longest_streak - last_longest_streak_length
# streak_end <- streak_begin + last_longest_streak_length - 1
# streak_df <- dfs[streak_begin:streak_end,]
# streak_date_start = as.character(streak_df[1,"date"])
# streak_date_end = as.character(streak_df[last_longest_streak_length,"date"])
#
# rm(dfs)
# gc()
#
# if(length(streak_date_start) > 0 & length(streak_date_end) > 0 & last_longest_streak_length > 0) {
# res <- data.frame(start_date = factor(streak_date_start, levels=lvls), end_date = factor(streak_date_end, levels=lvls), len = last_longest_streak_length)
# return(res)
# } else {
# return(data.frame(start_date = factor(character(0), levels=lvls), end_date = factor(character(0), levels=lvls), len = numeric(0)))
# }
# } else {
# return(data.frame(start_date = factor(character(0), levels=lvls), end_date = factor(character(0), levels=lvls), len = numeric(0)))
# }
# } else {
# return(data.frame(start_date = factor(character(0), levels=lvls), end_date = factor(character(0), levels=lvls), len = numeric(0)))
# }
#
#
# }
#
# save_file = "../input/international-soccer-games-streaks/df_top_streak.RData"
# if (!file.exists(save_file)) {
# df_top_streak <- df_streaks %>%
# group_by(team, opponent) %>%
# do(extract_streaks(., lvls = unique(df_streaks$date)))
# save(df_top_streak, file=save_file)
# } else {
# load(save_file)
# }
#
# df_top_streak %>%
# arrange((desc(len))) %>%
# head(20)
# Below are listed all the games from the Argentina's undefeated streak against Chile.
## Checking one individual example
# Print Argentina's games against Chile played between 1910-05-27 and
# 1959-03-07 (both dates included).
df_streaks %>%
  filter(
    team == "Argentina",
    opponent == "Chile",
    date >= as.Date("1910-05-27"),
    date <= as.Date("1959-03-07")
  )
## date year tournament team opponent team_score
## 1 1910-05-27 1910 Friendly Argentina Chile 3
## 2 1910-06-05 1910 Friendly Argentina Chile 5
## 3 1910-09-11 1910 Friendly Argentina Chile 3
## 4 1913-09-21 1913 Friendly Argentina Chile 2
## 5 1916-07-06 1916 Copa América Argentina Chile 6
## 6 1916-07-12 1916 Friendly Argentina Chile 1
## 7 1917-10-06 1917 Copa América Argentina Chile 1
## 8 1919-05-22 1919 Copa América Argentina Chile 4
## 9 1920-09-20 1920 Copa América Argentina Chile 1
## 10 1922-09-28 1922 Copa América Argentina Chile 4
## 11 1922-10-22 1922 Friendly Argentina Chile 1
## 12 1924-10-25 1924 Copa América Argentina Chile 2
## 13 1926-10-31 1926 Copa América Argentina Chile 1
## 14 1930-07-22 1930 FIFA World Cup Argentina Chile 3
## 15 1935-01-06 1935 Copa América Argentina Chile 4
## 16 1936-12-30 1936 Copa América Argentina Chile 2
## 17 1940-03-02 1940 Friendly Argentina Chile 4
## 18 1940-03-09 1940 Friendly Argentina Chile 3
## 19 1941-01-05 1941 Friendly Argentina Chile 2
## 20 1941-01-09 1941 Friendly Argentina Chile 5
## 21 1941-03-04 1941 Copa América Argentina Chile 1
## 22 1942-01-31 1942 Copa América Argentina Chile 0
## 23 1945-02-11 1945 Copa América Argentina Chile 1
## 24 1946-01-26 1946 Copa América Argentina Chile 3
## 25 1947-12-16 1947 Copa América Argentina Chile 1
## 26 1955-03-30 1955 Copa América Argentina Chile 1
## 27 1956-01-29 1956 Copa América Argentina Chile 2
## 28 1956-03-11 1956 Pan American Championship Argentina Chile 3
## 29 1957-03-28 1957 Copa América Argentina Chile 6
## 30 1957-10-13 1957 FIFA World Cup qualification Argentina Chile 2
## 31 1957-10-20 1957 FIFA World Cup qualification Argentina Chile 4
## 32 1959-03-07 1959 Copa América Argentina Chile 6
## opponent_score team_outcome where
## 1 1 W H
## 2 1 W H
## 3 0 W A
## 4 0 W A
## 5 1 W H
## 6 0 W H
## 7 0 W H
## 8 1 W H
## 9 1 D A
## 10 0 W H
## 11 0 W H
## 12 0 W H
## 13 1 D A
## 14 1 W H
## 15 1 W H
## 16 1 W H
## 17 1 W H
## 18 2 W H
## 19 1 W A
## 20 2 W A
## 21 0 W A
## 22 0 D H
## 23 1 D A
## 24 1 W H
## 25 1 D H
## 26 0 W A
## 27 0 W H
## 28 0 W H
## 29 2 W H
## 30 0 W A
## 31 0 W H
## 32 1 W H
# # Bonus track: Evolution of intercontinental games
#
# Earlier, we saw how different teams and continent start to engage in international games. However, teams tend to start playing against their neighbours. So, when did inter-continental games start and, by extension, soccer became an intercontinental game.
#
# Games are considered as intercontinental if they oppose two teams from different associations (e.g AFC and UEFA). Australia is part of the AFC but was relocated to Oceania for this analysis. The American continent is the union of the CONMEBOL and CONCACAF associations.
# We add some features to our data frame so that we can know whether a game is intercontinental.
# df_teams_games_extended <- df_teams_games %>%
# filter(where=="H") %>%
# #inner_join(df_federations %>% select(country, continent), by=c("team"="country")) %>%
# #inner_join(df_federations %>% select(country, continent), by=c("opponent"="country")) %>%
# rename(continent_home = continent.x,
# continent_away = continent.y) %>%
# mutate(continent_home = ifelse(team=="Australia", "Oceania", continent_home),
# continent_away = ifelse(opponent=="Australia", "Oceania", continent_away)) %>%
# rowwise() %>%
# mutate(intercontinental = (continent_home != continent_away)) %>%
# ungroup()
#
# tail(df_teams_games_extended)
# The first intercontinental game occurred in 1888 and opposed Scotland to Canada (4-0), 16 years after the very first international game. Notably, Scotland was already involved in the first international game. The next two other intercontinental games happened way later, in 1916 when, in the span of two weeks, the USA played against Sweden and Norway. Between the 1920's and 1940's, intercontinental games became more regular though still sparse (some years didn't count any).
# The end of WWII marked the beginning of global soccer as, since 1946, there has been at least one intercontinental game per year. The percentage of intercontinental games kept increasing until the 1990's, when, on average, about 15% of the games opposed teams from different continents.
# Since then, there has been a slight decrease in intercontinental games.
#
# There are some visible peaks of intercontinental games which coincide with World Cup years. This makes sense as, by design, teams will face teams of other continents and also take advantage of pre-competition friendly games to gauge their level against a wide range of teams from different continents.
# df_intercontinental_games_per_year <- df_teams_games_extended %>%
# filter(year < 2018) %>%
# group_by(year) %>%
# summarise(nb_inter = sum(intercontinental),
# perc_inter = nb_inter / length(intercontinental) * 100) %>%
# ungroup() %>%
# mutate(worldcup_year = year %in% wc_years)
# ggplot(df_intercontinental_games_per_year, aes(x=year, y=perc_inter)) +
# geom_line() +
# geom_point(data = df_intercontinental_games_per_year %>% filter(worldcup_year), aes(colour=worldcup_year)) +
# geom_smooth(method="loess") +
# labs(x="Year", title="% of intercontinental games", y="%", colour="World cup year?") +
# scale_x_continuous(breaks = seq(1870,2020,10))
### When did the first intercontinental games happen?
# df_teams_games_extended %>% filter(intercontinental) %>% head(5)
# Let's have a closer look and see what happens at a continent level.
#
# Oceania has been the continent most often involved in intercontinental games. In the early years it is due to many games being organised between New Zealand, Australia, India, Canada and South Africa, all Commonwealth countries. In the latest years, this is likely due to the fact that Australia is affiliated to the AFC and then plays qualification rounds against Asian teams.
# Europe has for a long time been the continent the least involved in intercontinental games; this is consistent with this continent hosting most of the oldest soccer teams.
# Africa was initially involved in many intercontinental games but is now the continent whose teams travel the least.
# America and Asia have similar trajectories: they used to play many games against teams from other continents but now are playing much more often against "local" opponents.
# df_intercontinental_games_per_year_per_continent <- df_teams_games_extended %>%
# filter(year < 2018) %>%
# group_by(year, continent_home) %>%
# summarise(nb_inter = sum(intercontinental),
# perc_inter = nb_inter / length(intercontinental) * 100)
#
# ggplot(df_intercontinental_games_per_year_per_continent, aes(x=year, y=perc_inter, group=continent_home, colour=continent_home)) +
# geom_smooth(method="loess") +
# labs(x="Year", title="% of intercontinental games", y="%", colour="Team's continent") +
# scale_x_continuous(breaks = seq(1870,2020,10))
# List of early games involving teams from Oceania
# df_teams_games_extended %>%
# filter(( continent_home == "Oceania" | continent_away == "Oceania") & year >= 1922 & year < 1950)
# Looking at how Australia is responsible for the high number of intercontinental games for Oceania.
# df_intercontinental_games_per_year_oceania <- df_teams_games_extended %>%
# filter(continent_home == "Oceania" | continent_away == "Oceania") %>%
# mutate(isAustralia = (team == "Australia" | opponent == "Australia")) %>%
# filter(year < 2018) %>%
# group_by(year, isAustralia) %>%
# summarise(nb_inter = sum(intercontinental),
# perc_inter = nb_inter / length(intercontinental) * 100)
# ggplot(df_intercontinental_games_per_year_oceania, aes(x=year, y=perc_inter, group=isAustralia, colour=isAustralia)) +
# geom_smooth(method="loess") +
# scale_x_continuous(breaks = seq(1870,2020,10)) +
# labs(x="Year", title="% of intercontinental games\nOceania only", y="%", colour="Team's continent")